scom-cam colab

# Session setup: attach packages, reset the workspace, load project helpers.
if (TRUE) {
  # load packages
  library("tidyverse")
  # library() stops with an error when the package is missing; require()
  # would only return FALSE and let a later call fail instead.
  library(gridExtra)
  library(grid)
  library(viridis)
  #library("quarto")
  library("irr")
  # clear workspace
  # NOTE(review): this only removes objects from the global environment;
  # attached packages and options survive. A fresh R session is the more
  # reliable way to get a clean state.
  rm(list = ls())
  # load functions
  # NOTE(review): presumably defines desc_get() and the dt03, dt04, ...
  # lookup tables used further down -- confirm against ../src/functions.R
  source("../src/functions.R")
}
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.1.8
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Loading required package: gridExtra


Attaching package: 'gridExtra'


The following object is masked from 'package:dplyr':

    combine


Loading required package: viridisLite

Loading required package: lpSolve

231110: clean dataset

# load dataset (tab-separated file with a header row)
#fn = "../csv/haidi-data-231012.csv"
fn <- "../csv/haidi-data-231107.csv"
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
data <- read.table(
  fn,
  sep = "\t",
  header = TRUE,
  strip.white = TRUE,
  stringsAsFactors = FALSE
) |>
  as_tibble()

frequency, news source, year

# Count rows per country/news-source label (cn) and year, then print the
# counts as a markdown table.
if (TRUE) {
  #
  result_df <- data |>
    # dt03 is not defined in this chunk; it is joined on its V2 column and
    # its V3 column supplies the source label used in cn.
    left_join(dt03, by = join_by(v03_news.source == V2)) |>
    mutate(
      cn = paste(v35_country, V3),
      # first run of four digits found in v02_date
      v36_date_year = str_extract(v02_date, "\\d{4}")
    ) |>
    group_by(cn, v36_date_year) |>
    # "drop_last" is what summarise() applies implicitly here; stating it
    # keeps the result grouped by cn as before and silences the message.
    summarize(count = n(), .groups = "drop_last")
  #
  cat(simplermarkdown::md_table(result_df))
}
`summarise()` has grouped output by 'cn'. You can override using the `.groups`
argument.
cn v36_date_year count
dk EB 2017 1
dk FS 2017 1
dk FS 2018 2
dk FS 2019 3
dk FS 2021 3
dk FS 2022 1
dk JV 2017 2
dk JV 2018 1
dk JV 2019 1
dk JV 2021 1
dk JV 2022 2
dk NJ 2018 1
dk NJ 2020 1
dk NJ 2022 1
dk PO 2017 3
dk PO 2018 2
dk PO 2019 9
dk PO 2020 4
dk PO 2021 5
dk PO 2022 5
fi HS 2017 2
fi HS 2018 1
fi HS 2019 7
fi HS 2020 3
fi HS 2021 2
fi IS 2017 1
fi IS 2020 2
fi IS 2022 1
fi K 2017 1
fi K 2019 1
fi K 2020 4
fi K 2021 3
fi K 2022 5
fi LK 2017 3
fi LK 2020 2
fi LK 2021 5
fi LK 2022 5
fi TS 2017 5
fi TS 2018 1
fi TS 2019 6
fi TS 2020 5
fi TS 2021 3
fi TS 2022 8
se AB 2017 2
se AB 2018 2
se AB 2019 1
se AB 2020 2
se AB 2021 1
se AB 2022 1
se DN 2017 1
se DN 2018 3
se DN 2019 3
se DN 2020 1
se DN 2021 4
se DN 2022 1
se GP 2017 4
se GP 2018 3
se GP 2019 2
se GP 2020 1
se GP 2021 1
se GP 2022 1
se SDS 2018 1
se SDS 2019 2
se SDS 2020 3
se SDS 2021 3
se SDS 2022 1
se VK 2017 2
se VK 2018 2
se VK 2019 7
se VK 2020 2
se VK 2021 2
se VK 2022 1
# Disabled chunk (never runs): would list country and extracted year for
# every row as a markdown table.
if (FALSE) {
  #
  result_df <- data |>
    mutate(v36_date_year = str_extract(v02_date, "\\d{4}")) |>
    select(v35_country, v36_date_year)
  #
  cat(simplermarkdown::md_table(result_df))
}
# Mean of v05_article.size within each v04_article.type, with the matching
# dt04 rows attached (joined on dt04's V2 column).
pd <- left_join(
  summarize(
    group_by(data, v04_article.type),
    mean = mean(v05_article.size)
  ),
  dt04,
  by = join_by(v04_article.type == V2)
)

news sources

#
# Number of rows per news source and country, with the dt03 lookup columns
# attached and a combined "country label" key (cn) for plotting.
pd <- data |>
  group_by(v03_news.source, v35_country) |>
  # "drop_last" matches summarise()'s implicit behaviour (result stays
  # grouped by v03_news.source) and silences the grouping message.
  summarize(count = n(), .groups = "drop_last") |>
  left_join(dt03, by = join_by(v03_news.source == V2)) |>
  mutate(cn = paste(v35_country, V3), cc = as.factor(v35_country))
`summarise()` has grouped output by 'v03_news.source'. You can override using
the `.groups` argument.
# Dodged bar chart of article counts per country, one bar per news source.
# A single manual fill scale carries both the legend title and the viridis
# palette. Previously a discrete scale set the title and was then replaced
# by a manual scale, which dropped the title and emitted
# "Scale for fill is already present".
# The palette length follows the mapped fill variable (cn).
p1 <- ggplot(pd, aes(x = v35_country, y = count, fill = cn)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  scale_fill_manual(
    name = pd$V1[1],
    values = viridis(length(unique(pd$cn)))
  ) +
  labs(y = "count", x = "v35_country", title = "v03_news.source by v35_country")

# alternative scales tried earlier, kept for reference
#    scale_fill_discrete(name=pd$V1[1], labels=V3)
#    scale_fill_manual(values=pd$cx)
#    scale_color_viridis(discrete=T) +
#p1 + scale_fill_brewer(colorRampPalette(brewer.pal(9,"YlOrRd"))(50))
#p1 + scale_fill_manual(values=colorRampPalette(brewer.pal(9,"Spectral"))(nrow(unique(pd[,1]))))
p1
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.

frequency charts

# get graphs
# For every selected variable number: call desc_get() with the matching
# dtNN lookup object, print the second element of its result and write that
# element to ../tmp/haidi-table-vNN.csv (tab-separated).
for (i in c(4:9, 12:15, 18:21, 22:32, 34)) {
  # zero-padded variable number, shared by the lookup name and the file name
  var_id <- sprintf("%02d", i)
  pd <- desc_get(data, get(paste0("dt", var_id)), i, var_pl = TRUE)

  # get table
  print(pd[2])
  #
  # `quote` is spelled out in full instead of relying on partial argument
  # matching (`quot`).
  write.table(
    pd[2],
    paste0("../tmp/haidi-table-v", var_id, ".csv"),
    sep = "\t",
    quote = TRUE,
    row.names = FALSE
  )
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
[[1]]
# A tibble: 10 × 9
# Groups:   v35_country [3]
   v35_country v04_article.type count   prop prop_lab count_…¹ V1       V2 V3   
   <chr>                  <int> <int>  <dbl>    <dbl>    <dbl> <chr> <int> <chr>
 1 dk                         1    26 0.531    0.735      36   Arti…     1 Opin…
 2 dk                         2     7 0.143    0.398      19.5 Arti…     2 News 
 3 dk                         3     7 0.143    0.255      12.5 Arti…     3 Feat…
 4 dk                         4     9 0.184    0.0918      4.5 Arti…     4 Other
 5 fi                         1    59 0.776    0.612      46.5 Arti…     1 Opin…
 6 fi                         2    14 0.184    0.132      10   Arti…     2 News 
 7 fi                         3     3 0.0395   0.0197      1.5 Arti…     3 Feat…
 8 se                         1    14 0.233    0.883      53   Arti…     1 Opin…
 9 se                         2    34 0.567    0.483      29   Arti…     2 News 
10 se                         3    12 0.2      0.1         6   Arti…     3 Feat…
# … with abbreviated variable name ¹​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 9 × 9
# Groups:   v35_country [3]
  v35_country v05_article.size count   prop prop_lab count_lab V1       V2 V3   
  <chr>                  <int> <int>  <dbl>    <dbl>     <dbl> <chr> <int> <chr>
1 dk                         1     7 0.143    0.929       45.5 Arti…     1 Small
2 dk                         2    19 0.388    0.663       32.5 Arti…     2 Medi…
3 dk                         3    23 0.469    0.235       11.5 Arti…     3 Large
4 fi                         1    47 0.618    0.691       52.5 Arti…     1 Small
5 fi                         2    24 0.316    0.224       17   Arti…     2 Medi…
6 fi                         3     5 0.0658   0.0329       2.5 Arti…     3 Large
7 se                         1     9 0.15     0.925       55.5 Arti…     1 Small
8 se                         2    10 0.167    0.767       46   Arti…     2 Medi…
9 se                         3    41 0.683    0.342       20.5 Arti…     3 Large
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 9 × 9
# Groups:   v35_country [3]
  v35_country v06_illustrated count   prop prop_lab count_lab V1        V2 V3   
  <chr>                 <int> <int>  <dbl>    <dbl>     <dbl> <chr>  <int> <chr>
1 dk                        0    14 0.286    0.857       42   Illus…     0 No   
2 dk                        1    23 0.469    0.480       23.5 Illus…     1 Yes,…
3 dk                        2    12 0.245    0.122        6   Illus…     2 Yes …
4 fi                        0    49 0.645    0.678       51.5 Illus…     0 No   
5 fi                        1    25 0.329    0.191       14.5 Illus…     1 Yes,…
6 fi                        2     2 0.0263   0.0132       1   Illus…     2 Yes …
7 se                        0     3 0.0508   0.975       57.5 Illus…     0 No   
8 se                        1    52 0.881    0.508       30   Illus…     1 Yes,…
9 se                        2     4 0.0678   0.0339       2   Illus…     2 Yes …
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 11 × 9
# Groups:   v35_country [3]
   v35_country v07_type.of.illu…¹ count   prop prop_…² count…³ V1       V2 V3   
   <chr>                    <int> <int>  <dbl>   <dbl>   <dbl> <chr> <int> <chr>
 1 dk                           1    15 0.429  0.786      27.5 Type…     1 Phot…
 2 dk                           2     3 0.0857 0.529      18.5 Type…     2 Phot…
 3 dk                           3     9 0.257  0.357      12.5 Type…     3 Cart…
 4 dk                           4     6 0.171  0.143       5   Type…     4 Mix …
 5 dk                           5     2 0.0571 0.0286      1   Type…     5 Mix …
 6 fi                           1    14 0.538  0.731      19   Type…     1 Phot…
 7 fi                           2    12 0.462  0.231       6   Type…     2 Phot…
 8 se                           1    38 0.679  0.661      37   Type…     1 Phot…
 9 se                           2    15 0.268  0.187      10.5 Type…     2 Phot…
10 se                           4     2 0.0357 0.0357      2   Type…     4 Mix …
11 se                           5     1 0.0179 0.00893     0.5 Type…     5 Mix …
# … with abbreviated variable names ¹​v07_type.of.illustration, ²​prop_lab,
#   ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v08_target.group.…¹ count   prop prop_…² count…³ V1       V2 V3   
  <chr>                     <int> <int>  <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                            0    27 0.931   0.534     15.5 Targ…     0 No   
2 dk                            1     2 0.0690  0.0345     1   Targ…     1 Yes  
3 fi                            0     4 0.267   0.867     13   Targ…     0 No   
4 fi                            1    11 0.733   0.367      5.5 Targ…     1 Yes  
5 se                            0    39 0.696   0.652     36.5 Targ…     0 No   
6 se                            1    17 0.304   0.152      8.5 Targ…     1 Yes  
# … with abbreviated variable names ¹​v08_target.group.in.illustration,
#   ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v09_agency.of.tar…¹ count   prop prop_…² count…³ V1       V2 V3   
  <chr>                     <int> <int>  <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                            0     1 0.5      0.75      1.5 Agen…     0 Pass…
2 dk                            1     1 0.5      0.25      0.5 Agen…     1 Acti…
3 fi                            0     1 0.0909   0.955    10.5 Agen…     0 Pass…
4 fi                            1    10 0.909    0.455     5   Agen…     1 Acti…
5 se                            0     7 0.412    0.794    13.5 Agen…     0 Pass…
6 se                            1    10 0.588    0.294     5   Agen…     1 Acti…
# … with abbreviated variable names
#   ¹​v09_agency.of.target.group.in.illustration, ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v12_heading.conte…¹ count   prop prop_…² count…³ V1       V2 V3   
  <chr>                     <int> <int>  <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                            0    42 0.857   0.571     28   Head…     0 No   
2 dk                            1     7 0.143   0.0714     3.5 Head…     1 Yes  
3 fi                            0    67 0.882   0.559     42.5 Head…     0 No   
4 fi                            1     9 0.118   0.0592     4.5 Head…     1 Yes  
5 se                            0    56 0.933   0.533     32   Head…     0 No   
6 se                            1     4 0.0667  0.0333     2   Head…     1 Yes  
# … with abbreviated variable names ¹​v12_heading.content.health, ²​prop_lab,
#   ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v13_heading.conte…¹ count   prop prop_…² count…³ V1       V2 V3   
  <chr>                     <int> <int>  <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                            0    46 0.939   0.531     26   Head…     0 No   
2 dk                            1     3 0.0612  0.0306     1.5 Head…     1 Yes  
3 fi                            0    57 0.75    0.625     47.5 Head…     0 No   
4 fi                            1    19 0.25    0.125      9.5 Head…     1 Yes  
5 se                            0    44 0.733   0.633     38   Head…     0 No   
6 se                            1    16 0.267   0.133      8   Head…     1 Yes  
# … with abbreviated variable names ¹​v13_heading.content.old, ²​prop_lab,
#   ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v14_heading.conten…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    34 0.694   0.653    32   Head…     0 No   
2 dk                             1    15 0.306   0.153     7.5 Head…     1 Yes  
3 fi                             0    48 0.632   0.684    52   Head…     0 No   
4 fi                             1    28 0.368   0.184    14   Head…     1 Yes  
5 se                             0    44 0.733   0.633    38   Head…     0 No   
6 se                             1    16 0.267   0.133     8   Head…     1 Yes  
# … with abbreviated variable names ¹​v14_heading.content.digital.tekn,
#   ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 5 × 9
# Groups:   v35_country [3]
  v35_country v15_heading.conte…¹ count   prop prop_…² count…³ V1       V2 V3   
  <chr>                     <int> <int>  <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                            0    49 1      0.5        24.5 Head…     0 No   
2 fi                            0    75 0.987  0.507      38.5 Head…     0 No   
3 fi                            1     1 0.0132 0.00658     0.5 Head…     1 Yes  
4 se                            0    55 0.917  0.542      32.5 Head…     0 No   
5 se                            1     5 0.0833 0.0417      2.5 Head…     1 Yes  
# … with abbreviated variable names ¹​v15_heading.content.ill, ²​prop_lab,
#   ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 28 × 9
# Groups:   v35_country [3]
   v35_country v18_article.cont…¹ count   prop prop_…² count…³ V1       V2 V3   
   <chr>                    <int> <int>  <dbl>   <dbl>   <dbl> <chr> <int> <chr>
 1 dk                           1     6 0.122    0.939    46   Arti…     1 Medi…
 2 dk                           2     1 0.0204   0.867    42.5 Arti…     2 Scie…
 3 dk                           3    14 0.286    0.714    35   Arti…     3 Poli…
 4 dk                           4     1 0.0204   0.561    27.5 Arti…     4 Busi…
 5 dk                           5     1 0.0204   0.541    26.5 Arti…     5 Educ…
 6 dk                           6     1 0.0204   0.520    25.5 Arti…     6 Work…
 7 dk                           8     3 0.0612   0.480    23.5 Arti…     8 Leis…
 8 dk                           9     1 0.0204   0.439    21.5 Arti…     9 Ente…
 9 dk                          10     8 0.163    0.347    17   Arti…    10 Heal…
10 dk                          12    13 0.265    0.133     6.5 Arti…    12 Othe…
# … with 18 more rows, and abbreviated variable names
#   ¹​v18_article.content.dominant.theme, ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 20 × 9
# Groups:   v35_country [3]
   v35_country v19_subject.posi…¹ count   prop prop_…² count…³ V1       V2 V3   
   <chr>                    <int> <int>  <dbl>   <dbl>   <dbl> <chr> <int> <chr>
 1 dk                           1    24 0.774   0.613     19   Subj…     1 olde…
 2 dk                           3     3 0.0968  0.177      5.5 Subj…     3 Pati…
 3 dk                           6     1 0.0323  0.113      3.5 Subj…     6 Prob…
 4 dk                           9     3 0.0968  0.0484     1.5 Subj…     9 Mix …
 5 fi                           1    59 0.787   0.607     45.5 Subj…     1 olde…
 6 fi                           2     2 0.0267  0.2       15   Subj…     2 Citi…
 7 fi                           3     2 0.0267  0.173     13   Subj…     3 Pati…
 8 fi                           4     2 0.0267  0.147     11   Subj…     4 User…
 9 fi                           8     2 0.0267  0.12       9   Subj…     8 Othe…
10 fi                           9     5 0.0667  0.0733     5.5 Subj…     9 Mix …
11 fi                          11     3 0.04    0.02       1.5 Subj…    11 Unde…
12 se                           1    40 0.667   0.667     40   Subj…     1 olde…
13 se                           2     2 0.0333  0.317     19   Subj…     2 Citi…
14 se                           3     5 0.0833  0.258     15.5 Subj…     3 Pati…
15 se                           4     1 0.0167  0.208     12.5 Subj…     4 User…
16 se                           5     4 0.0667  0.167     10   Subj…     5 Risk…
17 se                           6     2 0.0333  0.117      7   Subj…     6 Prob…
18 se                           7     2 0.0333  0.0833     5   Subj…     7 Expe…
19 se                           8     2 0.0333  0.05       3   Subj…     8 Othe…
20 se                           9     2 0.0333  0.0167     1   Subj…     9 Mix …
# … with abbreviated variable names ¹​v19_subject.positioning, ²​prop_lab,
#   ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 5 × 9
# Groups:   v35_country [3]
  v35_country v20_agency.of.targ…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    49 1      0.5       24.5 Agen…     0 No   
2 fi                             0    62 0.816  0.592     45   Agen…     0 No   
3 fi                             1    14 0.184  0.0921     7   Agen…     1 Yes  
4 se                             0    53 0.883  0.558     33.5 Agen…     0 No   
5 se                             1     7 0.117  0.0583     3.5 Agen…     1 Yes  
# … with abbreviated variable names
#   ¹​v20_agency.of.target.group.in.article.voice, ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v21_gender.visibl…¹ count   prop prop_…² count…³ V1       V2 V3   
  <chr>                     <int> <int>  <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                            0    48 0.980   0.510     25   Gend…     0 No   
2 dk                            1     1 0.0204  0.0102     0.5 Gend…     1 Yes  
3 fi                            0    62 0.816   0.592     45   Gend…     0 No   
4 fi                            1    14 0.184   0.0921     7   Gend…     1 Yes  
5 se                            0    47 0.783   0.608     36.5 Gend…     0 No   
6 se                            1    13 0.217   0.108      6.5 Gend…     1 Yes  
# … with abbreviated variable names ¹​v21_gender.visible.mentioned, ²​prop_lab,
#   ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v22_actors.mention…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    24 0.490   0.755    37   Acto…     0 No   
2 dk                             1    25 0.510   0.255    12.5 Acto…     1 Yes  
3 fi                             0    47 0.618   0.691    52.5 Acto…     0 No   
4 fi                             1    29 0.382   0.191    14.5 Acto…     1 Yes  
5 se                             0    44 0.733   0.633    38   Acto…     0 No   
6 se                             1    16 0.267   0.133     8   Acto…     1 Yes  
# … with abbreviated variable names
#   ¹​v22_actors.mentioned.given.voice.state.gov, ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v23_actors.mention…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    37 0.755   0.622    30.5 Acto…     0 No   
2 dk                             1    12 0.245   0.122     6   Acto…     1 Yes  
3 fi                             0    42 0.553   0.724    55   Acto…     0 No   
4 fi                             1    34 0.447   0.224    17   Acto…     1 Yes  
5 se                             0    40 0.667   0.667    40   Acto…     0 No   
6 se                             1    20 0.333   0.167    10   Acto…     1 Yes  
# … with abbreviated variable names ¹​v23_actors.mentioned.given.voice.region,
#   ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v24_actors.mention…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    24 0.490   0.755    37   Acto…     0 No   
2 dk                             1    25 0.510   0.255    12.5 Acto…     1 Yes  
3 fi                             0    58 0.763   0.618    47   Acto…     0 No   
4 fi                             1    18 0.237   0.118     9   Acto…     1 Yes  
5 se                             0    43 0.717   0.642    38.5 Acto…     0 No   
6 se                             1    17 0.283   0.142     8.5 Acto…     1 Yes  
# … with abbreviated variable names
#   ¹​v24_actors.mentioned.given.voice.municipality, ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v25_actors.mention…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    29 0.592   0.704    34.5 Acto…     0 No   
2 dk                             1    20 0.408   0.204    10   Acto…     1 Yes  
3 fi                             0    47 0.618   0.691    52.5 Acto…     0 No   
4 fi                             1    29 0.382   0.191    14.5 Acto…     1 Yes  
5 se                             0    45 0.75    0.625    37.5 Acto…     0 No   
6 se                             1    15 0.25    0.125     7.5 Acto…     1 Yes  
# … with abbreviated variable names ¹​v25_actors.mentioned.given.voice.agency,
#   ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v26_actors.mention…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    26 0.531  0.735     36   Acto…     0 No   
2 dk                             1    23 0.469  0.235     11.5 Acto…     1 Yes  
3 fi                             0    62 0.816  0.592     45   Acto…     0 No   
4 fi                             1    14 0.184  0.0921     7   Acto…     1 Yes  
5 se                             0    45 0.75   0.625     37.5 Acto…     0 No   
6 se                             1    15 0.25   0.125      7.5 Acto…     1 Yes  
# … with abbreviated variable names
#   ¹​v26_actors.mentioned.given.voice.politician.party, ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v27_actors.mention…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    23 0.469   0.765    37.5 Acto…     0 No   
2 dk                             1    26 0.531   0.265    13   Acto…     1 Yes  
3 fi                             0    43 0.566   0.717    54.5 Acto…     0 No   
4 fi                             1    33 0.434   0.217    16.5 Acto…     1 Yes  
5 se                             0    43 0.717   0.642    38.5 Acto…     0 No   
6 se                             1    17 0.283   0.142     8.5 Acto…     1 Yes  
# … with abbreviated variable names
#   ¹​v27_actors.mentioned.given.voice.physician.nurse.health.staff, ²​prop_lab,
#   ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v28_actors.mention…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    27 0.551   0.724    35.5 Acto…     0 No   
2 dk                             1    22 0.449   0.224    11   Acto…     1 Yes  
3 fi                             0    35 0.461   0.770    58.5 Acto…     0 No   
4 fi                             1    41 0.539   0.270    20.5 Acto…     1 Yes  
5 se                             0    36 0.6     0.7      42   Acto…     0 No   
6 se                             1    24 0.4     0.2      12   Acto…     1 Yes  
# … with abbreviated variable names
#   ¹​v28_actors.mentioned.given.voice.organ.health.care.service, ²​prop_lab,
#   ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v29_actors.mention…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    29 0.592   0.704    34.5 Acto…     0 No   
2 dk                             1    20 0.408   0.204    10   Acto…     1 Yes  
3 fi                             0    51 0.671   0.664    50.5 Acto…     0 No   
4 fi                             1    25 0.329   0.164    12.5 Acto…     1 Yes  
5 se                             0    43 0.717   0.642    38.5 Acto…     0 No   
6 se                             1    17 0.283   0.142     8.5 Acto…     1 Yes  
# … with abbreviated variable names
#   ¹​v29_actors.mentioned.given.voice.scientist, ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v30_actors.mention…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    39 0.796   0.602    29.5 Acto…     0 No   
2 dk                             1    10 0.204   0.102     5   Acto…     1 Yes  
3 fi                             0    59 0.776   0.612    46.5 Acto…     0 No   
4 fi                             1    17 0.224   0.112     8.5 Acto…     1 Yes  
5 se                             0    40 0.667   0.667    40   Acto…     0 No   
6 se                             1    20 0.333   0.167    10   Acto…     1 Yes  
# … with abbreviated variable names ¹​v30_actors.mentioned.given.voice.ngo,
#   ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v31_actors.mention…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    31 0.633   0.684    33.5 Acto…     0 No   
2 dk                             1    18 0.367   0.184     9   Acto…     1 Yes  
3 fi                             0    58 0.763   0.618    47   Acto…     0 No   
4 fi                             1    18 0.237   0.118     9   Acto…     1 Yes  
5 se                             0    44 0.733   0.633    38   Acto…     0 No   
6 se                             1    16 0.267   0.133     8   Acto…     1 Yes  
# … with abbreviated variable names ¹​v31_actors.mentioned.given.voice.industry,
#   ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 6 × 9
# Groups:   v35_country [3]
  v35_country v32_actors.mention…¹ count  prop prop_…² count…³ V1       V2 V3   
  <chr>                      <int> <int> <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                             0    13 0.265   0.867    42.5 Acto…     0 No   
2 dk                             1    36 0.735   0.367    18   Acto…     1 Yes  
3 fi                             0    38 0.5     0.75     57   Acto…     0 No   
4 fi                             1    38 0.5     0.25     19   Acto…     1 Yes  
5 se                             0    39 0.65    0.675    40.5 Acto…     0 No   
6 se                             1    21 0.35    0.175    10.5 Acto…     1 Yes  
# … with abbreviated variable names
#   ¹​v32_actors.mentioned.given.voice.other.citizen.family.relatives,
#   ²​prop_lab, ³​count_lab
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

[[1]]
# A tibble: 8 × 9
# Groups:   v35_country [3]
  v35_country v34_tonality.of.a…¹ count   prop prop_…² count…³ V1       V2 V3   
  <chr>                     <int> <int>  <dbl>   <dbl>   <dbl> <chr> <int> <chr>
1 dk                            1    22 0.449   0.776     38   Tona…     1 Prim…
2 dk                            2    24 0.490   0.306     15   Tona…     2 Prim…
3 dk                            3     3 0.0612  0.0306     1.5 Tona…     3 Can …
4 fi                            1    26 0.342   0.829     63   Tona…     1 Prim…
5 fi                            2    33 0.434   0.441     33.5 Tona…     2 Prim…
6 fi                            3    17 0.224   0.112      8.5 Tona…     3 Can …
7 se                            1    20 0.333   0.833     50   Tona…     1 Prim…
8 se                            2    40 0.667   0.333     20   Tona…     2 Prim…
# … with abbreviated variable names ¹​v34_tonality.of.article, ²​prop_lab,
#   ³​count_lab
# Concatenate every ../tmp/haidi-table-* file, as-is and in glob order, into
# the single file ../tmp/haidi-tables.csv (overwriting it if it exists).
cat ../tmp/haidi-table-* > ../tmp/haidi-tables.csv

index variable

#
# Box plot of column 33 by column 35, with the group mean overlaid.
# NOTE(review): columns are picked by position; the axis labels below assume
# 33 is v33_power.sum.index and 35 is v35_country -- confirm against the file.
pd <- data |>
  select(35, 33) |>
  rename(v35 = 1, v33 = 2)
p1 <- ggplot(pd, aes(x = v35, y = v33, fill = v35)) +
  geom_boxplot(alpha = 1.0) +
  # The default geom for stat_summary() is a pointrange; with only `fun`
  # given it has no ymin/ymax, which caused the "Removed 3 rows ...
  # (`geom_segment()`)" warning. Draw the mean as a plain point instead.
  stat_summary(fun = mean, geom = "point") +
  labs(
    y = "v33_power.sum.index",
    x = "v35_country",
    title = "v33_power.sum.index by v35_country"
  ) +
  theme(legend.position = "none")

#
p1
Warning: Removed 3 rows containing missing values (`geom_segment()`).

#
#dt03 |> as_tibble()

# load dataset, rename cols, country
fn <- "../csv/haidi-data-231012.csv"
#fn = "../csv/haidi-data-231107.csv"
#data = read.table(fn, sep='\t', quote="", header=F, strip.white=TRUE, stringsAsFactors=FALSE) |> as_tibble() |> rename_with(~ cn, all_of(paste0(rep("V",34), seq(1,34))))
data <- read.table(
  fn,
  sep = "\t",
  header = TRUE,
  strip.white = TRUE,
  stringsAsFactors = FALSE
) |>
  as_tibble()

# add country variable
#data = data |> 
#mutate(v35_country = as.numeric(str_extract(data$v03_news.source, "^."))) |> 
#mutate(v35_country = ifelse(v35_country==1,"se", ifelse(v35_country==2,"dk", ifelse(v35_country==3,"fi",NA))))

# clean data
# Columns 18-21: keep only the leading run of digits and convert to numeric
# (values without leading digits become NA). Column 33 is renamed to
# v33_power.sum.index.
data <- data |>
  mutate(across(18:21, \(x) as.numeric(str_extract(x, "^\\d+")))) |>
  rename(v33_power.sum.index = 33)

# 
#write.table(data, fn, sep="\t", quot=T, row.names=F)
  • combined dataset link
  • combined figures link

231025: color theme

library(RColorBrewer)
#display.brewer.all()

# custom theme
some_graph <- theme(panel.grid.major = element_line(linewidth = 2))
some_color <- c("deeppink", "chartreuse", "midnightblue")
# put the elements in a list
# A manual-colour variant used to be assigned here and was overwritten by the
# very next line, so it never took effect; kept as a commented alternative.
#theme_haidi <- list(some_graph, scale_color_manual(values=some_color))
theme_haidi <- list(some_graph, scale_colour_brewer(palette = "Blues"))

231019: descriptives

# Number of rows per news source and country, with the dt03 lookup columns
# attached and a combined "country label" key (cn) for plotting.
pd <- data |>
  group_by(v03_news.source, v35_country) |>
  # "drop_last" matches summarise()'s implicit behaviour (result stays
  # grouped by v03_news.source) and silences the grouping message.
  summarize(count = n(), .groups = "drop_last") |>
  left_join(dt03, by = join_by(v03_news.source == V2)) |>
  mutate(cn = paste(v35_country, V3), cc = as.factor(v35_country))
`summarise()` has grouped output by 'v03_news.source'. You can override using
the `.groups` argument.
# Dodged bar chart of article counts per country, one bar per news source.
# A single manual fill scale carries both the legend title and the viridis
# palette. Previously a discrete scale set the title and was then replaced
# by a manual scale, which dropped the title and emitted
# "Scale for fill is already present".
# The palette length follows the mapped fill variable (cn).
p1 <- ggplot(pd, aes(x = v35_country, y = count, fill = cn)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  scale_fill_manual(
    name = pd$V1[1],
    values = viridis(length(unique(pd$cn)))
  ) +
  labs(y = "count", x = "v35_country", title = "v03_news.source by v35_country")

# alternative scales tried earlier, kept for reference
#    scale_fill_discrete(name=pd$V1[1], labels=V3)
#    scale_fill_manual(values=pd$cx)
#    scale_color_viridis(discrete=T) +
#p1 + scale_fill_brewer(colorRampPalette(brewer.pal(9,"YlOrRd"))(50))
#p1 + scale_fill_manual(values=colorRampPalette(brewer.pal(9,"Spectral"))(nrow(unique(pd[,1]))))
p1
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.

# keep columns 35 and 33 under short names for plotting
pd <- data |>
  select(v35 = 35, v33 = 33)

# boxplot per v35 group with the group mean overlaid by stat_summary
p1 <- ggplot(pd, aes(x = v35, y = v33, fill = v35)) +
  geom_boxplot(alpha = 1.0) +
  stat_summary(fun = mean) +
  labs(
    title = "v33_power.sum.index by v35_country",
    x = "v35_country",
    y = "v33_power.sum.index"
  ) +
  theme(legend.position = "none")

#
p1
Warning: Removed 3 rows containing missing values (`geom_segment()`).

# run desc_get for variables 33-34, passing the object named dt<i> for each;
# NOTE(review): desc_get is not defined in this chunk, presumably in ../src/functions.R
for (i in 33:34) {
  pl <- desc_get(data, get(sprintf("dt%02d", i)), i)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

231018: descriptives

# run desc_get for variables 22-32, passing the object named dt<i> for each;
# pl only keeps the result of the last iteration
for (i in 22:32) {
  pl <- desc_get(data, get(sprintf("dt%02d", i)), i)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

# run desc_get for variables 18-21, passing the object named dt<i> for each;
# pl only keeps the result of the last iteration
for (i in 18:21) {
  pl <- desc_get(data, get(sprintf("dt%02d", i)), i)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

231017: descriptives

# run desc_get for variables 12-15, passing the object named dt<i> for each;
# pl only keeps the result of the last iteration
for (i in 12:15) {
  pl <- desc_get(data, get(sprintf("dt%02d", i)), i)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

231016: descriptives

# run desc_get for variables 6-9; the object names are zero-padded (dt06 ... dt09)
for (i in 6:9) {
  pl <- desc_get(data, get(sprintf("dt%02d", i)), i)
}
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

# run desc_get for variable 4 with the dt04 object
pl <- desc_get(data, dt04, 4)
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.

#grid.arrange(pl[1], pl[2], ncol=2, widths=c(3, 4))

231012: descriptives

# plot data, grouped: counts and within-country shares of each article size,
# plus label positions at the middle of each stacked segment
pdg <- data |>
  group_by(v35_country, v05_article.size) |>
  summarise(count = n()) |>
  mutate(
    prop = count / sum(count),
    prop_lab = rev(cumsum(rev(prop))) - prop / 2,
    count_lab = rev(cumsum(rev(count))) - count / 2
  ) |>
  left_join(
    dt05 |> as_tibble() |> arrange(V2),
    by = join_by(v05_article.size == V2),
    keep = TRUE
  )
`summarise()` has grouped output by 'v35_country'. You can override using the
`.groups` argument.
# p1: 100% stacked bars per country with percentage labels
p1 <- ggplot(data = pdg, aes(x = v35_country, y = count, fill = as.factor(V2))) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
#    geom_text(aes(label=paste0(prop*100,"%")), position=position_stack(vjust=0.5), size=2)
  geom_text(aes(y = prop_lab, label = paste0(round(100 * prop, 1), "%")), size = 3) +
  scale_fill_discrete(name = dt05$V1[1], labels = dt05$V3)

# p2: stacked counts per country with count labels
p2 <- ggplot(data = pdg, aes(x = v35_country, y = count, fill = as.factor(V2))) +
  geom_col(position = "stack") +
  geom_text(aes(y = count_lab, label = count), size = 3) +
  scale_fill_discrete(name = dt05$V1[1], labels = dt05$V3)

# p3: stacked counts per country with percentage labels and no legend
p3 <- ggplot(data = pdg, aes(x = v35_country, y = count, fill = as.factor(V2))) +
  geom_col(position = "stack") +
#    geom_text(aes(label=count, y=count_lab), size=3) +
  geom_text(aes(y = count_lab, label = paste0(round(100 * prop, 1), "%")), size = 3) +
#    scale_fill_discrete(name=dt05$V1[1], labels=dt05$V3)
  scale_fill_discrete(guide = "none")

# plot data, combined: the same summary over all countries together
pdc <- data |>
  group_by(v05_article.size) |>
  summarise(count = n()) |>
  mutate(
    prop = count / sum(count),
    prop_lab = rev(cumsum(rev(prop))) - prop / 2,
    count_lab = rev(cumsum(rev(count))) - count / 2
  ) |>
  left_join(
    dt05 |> as_tibble(),
    by = join_by(v05_article.size == V2),
    keep = TRUE
  )

# p4: single 100% stacked bar for the combined data
p4 <- ggplot(data = pdc, aes(x = "combined", y = count, fill = as.factor(V2))) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  geom_text(aes(y = prop_lab, label = paste0(round(100 * prop, 1), "%")), size = 3) +
  scale_fill_discrete(name = dt05$V1[1], labels = dt05$V3) +
  labs(x = "v35_country", y = "percentage")

#par(mfrow = c(1,2))
#pdf("../fig/foo.pdf")#png()
# combined bar on the left, per-country bars on the right (width ratio 3:4)
grid.arrange(p4, p3, ncol = 2, widths = c(3, 4))

#dev.off()
  • combined dataset link

231009: dk icr re-test

# load dataset (tab-separated, no header row)
fn <- "../csv/haidi-wp1-coding-dk-2.tsv"
#data = read.table(fn, sep='\t', header=F, strip.white=TRUE, stringsAsFactors=FALSE, quote="")
data <- read.table(
  fn,
  sep = "\t",
  header = FALSE,
  strip.white = TRUE,
  stringsAsFactors = FALSE
) |>
  as_tibble()

# select data: drop incomplete rows and column 2, move the last column to the
# front, and name the two id columns;
# `some` is a copy of content_id (content_id for both coders)
data <- data |>
  na.omit() |>
  select(-2) |>
  select(last_col(), 1:29) |>
  rename(coder_id = V31, content_id = V1) |>
  mutate(some = content_id)
# clean data
#data$V3 = data$V3 |> str_extract("\\d+") |> as.numeric()

analyze data

# 
# save the long-format data (tab-separated, quoted strings, no row names)
write.table(
  data,
  "../csv/haidi-dk-2.tsv",
  sep = "\t",
  quote = TRUE,
  row.names = FALSE
)
# transform data to wide (10 content units * 28 content vars):
# one row per coder, one column per variable/content-unit pair
data <- data |>
  pivot_wider(id_cols = coder_id, names_from = some, values_from = 3:30) |>
  select(-coder_id)
# 
data
# A tibble: 2 × 280
  V3_DK001 V3_DK002 V3_DK003 V3_DK004 V3_DK005 V3_DK006 V3_DK007 V3_DK…¹ V3_DK…²
     <int>    <int>    <int>    <int>    <int>    <int>    <int>   <int>   <int>
1      208      206      208      206      206      206      206     206     206
2      208      206      208      206      206      206      206     206     206
# … with 271 more variables: V3_DK010 <int>, V4_DK001 <int>, V4_DK002 <int>,
#   V4_DK003 <int>, V4_DK004 <int>, V4_DK005 <int>, V4_DK006 <int>,
#   V4_DK007 <int>, V4_DK008 <int>, V4_DK009 <int>, V4_DK010 <int>,
#   V5_DK001 <int>, V5_DK002 <int>, V5_DK003 <int>, V5_DK004 <int>,
#   V5_DK005 <int>, V5_DK006 <int>, V5_DK007 <int>, V5_DK008 <int>,
#   V5_DK009 <int>, V5_DK010 <int>, V6_DK001 <int>, V6_DK002 <int>,
#   V6_DK003 <int>, V6_DK004 <int>, V6_DK005 <int>, V6_DK006 <int>, …
# inter rater reliability: matrix with one row per coder
data <- as.matrix(data)
# specify data type: nominal, ordinal, interval, ratio 
irr::kripp.alpha(data, method = "nominal")
 Krippendorff's alpha

 Subjects = 280 
   Raters = 2 
    alpha = 0.704 
# write data (the file is tab-separated despite the .csv extension)
fn <- "../csv/haidi-wp1-coding-dk-2-wide.csv"
write.table(data, fn, sep = "\t", quote = TRUE, row.names = FALSE)
  • dataset wide format link

231004: all

# bash code chunk
# list the first 10 fields of the header row, one per numbered line
head -n1 ../csv/haidi-all.tsv | tr '\t' '\n' | cat -n | head
     1  "coder_id"
     2  "content_id"
     3  "V3"
     4  "V4"
     5  "V5"
     6  "V6"
     7  "V7"
     8  "V8"
     9  "V9"
    10  "V12"
# load datasets (combined tab-separated file with a header row)
fn <- "../csv/haidi-all.tsv"
data <- read.table(
  fn,
  sep = "\t",
  header = TRUE,
  strip.white = TRUE,
  stringsAsFactors = FALSE
) |>
  as_tibble()

# number of rows per content unit and coder
data |>
  group_by(content_id, coder_id) |>
  summarise(count = n())
`summarise()` has grouped output by 'content_id'. You can override using the
`.groups` argument.
# A tibble: 60 × 3
# Groups:   content_id [40]
   content_id coder_id count
   <chr>      <chr>    <int>
 1 DK001      A            1
 2 DK001      B            1
 3 DK002      A            1
 4 DK002      B            1
 5 DK003      A            1
 6 DK003      B            1
 7 DK004      A            1
 8 DK004      B            1
 9 DK005      A            1
10 DK005      B            1
# … with 50 more rows
data |> count(V4, name = "count")
# A tibble: 4 × 2
     V4 count
  <int> <int>
1     1    44
2     2    10
3     3     3
4     4     3
# bar chart of the number of rows per V4 value
data |>
  count(V4, name = "count") |>
  ggplot(aes(x = V4, y = count)) +
#  geom_bar(fill="green", stat="identity") + 
  geom_col() +
  theme_minimal()

data |>
  count(V4, name = "count") |>
  arrange(desc(V4)) |>
  mutate(
    prop = round(count * 100 / sum(count), 1),
    lab.ypos = cumsum(prop) - 0.5 * prop
  )
# A tibble: 4 × 4
     V4 count  prop lab.ypos
  <int> <int> <dbl>    <dbl>
1     4     3   5        2.5
2     3     3   5        7.5
3     2    10  16.7     18.4
4     1    44  73.3     63.4
# pie chart: a single stacked bar of percentages drawn in polar coordinates,
# with each label placed at the middle of its segment (lab.ypos)
# NOTE(review): V4 is integer, so fill uses a continuous colour scale;
# wrap it in factor() if discrete slice colours are intended
data |>
  count(V4, name = "count") |>
  arrange(desc(V4)) |>
  mutate(
    prop = round(count * 100 / sum(count), 1),
    lab.ypos = cumsum(prop) - 0.5 * prop
  ) |>
  ggplot(aes(x = "", y = prop, fill = V4)) +
  geom_col(width = 1, color = "white") +
  geom_text(aes(y = lab.ypos, label = prop), color = "white") +
  coord_polar("y", start = 0) +
  theme_minimal()

231003: all datasets

# load datasets: one tab-separated file with a header row per country
fn <- "../csv/haidi-dk.tsv"
data_dk <- read.table(fn, sep = "\t", header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE)
fn <- "../csv/haidi-fi.tsv"
data_fi <- read.table(fn, sep = "\t", header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE)
fn <- "../csv/haidi-se.tsv"
data_se <- read.table(fn, sep = "\t", header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE)

# stack the three country files; rbind requires identical column names
data <- rbind(data_dk, data_fi, data_se)
as_tibble(data)
# A tibble: 60 × 31
   coder_id conten…¹    V3    V4    V5    V6    V7    V8    V9   V12   V13   V14
   <chr>    <chr>    <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
 1 A        DK001      208     4     3     2     1     0    99     0     0     0
 2 A        DK002      206     4     3     2     4     0    99     0     0     0
 3 A        DK003      208     1     2     0    99     0    99     0     0     0
 4 A        DK004      206     1     3     1     3     0    99     0     0     1
 5 A        DK005      206     1     2     0    99     0    99     0     0     0
 6 A        DK006      206     1     3     1     3     0    99     0     0     0
 7 A        DK007      206     1     3     0    99     0    99     0     0     0
 8 A        DK008      206     1     2     1     3     0    99     0     0     0
 9 A        DK009      206     1     2     1     3     0    99     1     0     0
10 A        DK010      206     1     1     0    99    99    99     0     0     0
# … with 50 more rows, 19 more variables: V15 <int>, V18 <int>, V19 <int>,
#   V20 <int>, V21 <int>, V22 <int>, V23 <int>, V24 <int>, V25 <int>,
#   V26 <int>, V27 <int>, V28 <int>, V29 <int>, V30 <int>, V31 <int>,
#   V32 <int>, V33 <int>, V34 <int>, some <chr>, and abbreviated variable name
#   ¹​content_id
# 
# save the combined data (tab-separated, quoted strings, no row names)
write.table(
  data,
  "../csv/haidi-all.tsv",
  sep = "\t",
  quote = TRUE,
  row.names = FALSE
)

230525: finnish dataset

# load dataset (tab-separated, no header row, quote handling disabled)
fn <- "../csv/haidi-wp1-coding-fi.tsv"
#data = read.table(fn, sep='\t', header=F, strip.white=TRUE, stringsAsFactors=FALSE)
data <- read.table(
  fn,
  sep = "\t",
  header = FALSE,
  quote = "",
  strip.white = TRUE,
  stringsAsFactors = FALSE
)
# select data: drop incomplete rows and columns 2, 10, 11, 16, 17, move the
# last column to the front, and name the two id columns
data <- data |>
  as_tibble() |>
  na.omit() |>
  select(-c(2, 10, 11, 16, 17)) |>
  select(last_col(), 1:29) |>
  rename(coder_id = V35, content_id = V1) |>
  mutate(
    # content_id for both coders
    some = content_id,
    # clean data: keep the first run of digits in V3 as a number
    V3 = as.numeric(str_extract(V3, "\\d+"))
  )

analyze data

# 
# save the long-format data (tab-separated, quoted strings, no row names)
write.table(
  data,
  "../csv/haidi-fi.tsv",
  sep = "\t",
  quote = TRUE,
  row.names = FALSE
)
# transform data to wide (10 content units * 28 content vars):
# one row per coder, one column per variable/content-unit pair
data <- data |>
  pivot_wider(id_cols = coder_id, names_from = some, values_from = 3:30) |>
  select(-coder_id)
# 
data
# A tibble: 2 × 280
  V3_FI001 V3_FI002 V3_FI003 V3_FI004 V3_FI005 V3_FI006 V3_FI007 V3_FI…¹ V3_FI…²
     <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
1      315      311      311      313      314      315      315     314     315
2      315      311      311      313      314      315      315     314     315
# … with 271 more variables: V3_FI010 <dbl>, V4_FI001 <int>, V4_FI002 <int>,
#   V4_FI003 <int>, V4_FI004 <int>, V4_FI005 <int>, V4_FI006 <int>,
#   V4_FI007 <int>, V4_FI008 <int>, V4_FI009 <int>, V4_FI010 <int>,
#   V5_FI001 <int>, V5_FI002 <int>, V5_FI003 <int>, V5_FI004 <int>,
#   V5_FI005 <int>, V5_FI006 <int>, V5_FI007 <int>, V5_FI008 <int>,
#   V5_FI009 <int>, V5_FI010 <int>, V6_FI001 <int>, V6_FI002 <int>,
#   V6_FI003 <int>, V6_FI004 <int>, V6_FI005 <int>, V6_FI006 <int>, …
# inter rater reliability: matrix with one row per coder
data <- as.matrix(data)
# specify data type: nominal, ordinal, interval, ratio 
irr::kripp.alpha(data, method = "nominal")
 Krippendorff's alpha

 Subjects = 280 
   Raters = 2 
    alpha = 0.72 
# write data (the file is tab-separated despite the .csv extension)
fn <- "../csv/haidi-wp1-coding-fi-wide.csv"
write.table(data, fn, sep = "\t", quote = TRUE, row.names = FALSE)
  • dataset wide format link

230523: danish dataset

# load dataset (tab-separated, no header row); quote="" disables quote
# handling, so quote characters in text fields are read literally.
# The earlier read with default quoting was overwritten immediately and is dropped.
fn <- "../csv/haidi-wp1-coding-dk.tsv"
data <- read.table(
  fn,
  sep = "\t",
  header = FALSE,
  quote = "",
  strip.white = TRUE,
  stringsAsFactors = FALSE
)
# select data: drop incomplete rows and columns 2, 10, 11, 16, 17, move the
# last column to the front, and name the two id columns
data <- data |>
  as_tibble() |>
  na.omit() |>
  select(-c(2, 10, 11, 16, 17)) |>
  select(last_col(), 1:29) |>
  rename(coder_id = V35, content_id = V1)
# content_id for both coders
data$some <- data$content_id
# clean data: keep the leading digits of V18 as a number
data$V18 <- as.numeric(str_extract(data$V18, "^\\d+"))

analyze data

# 
# save the long-format data (tab-separated, quoted strings, no row names)
write.table(
  data,
  "../csv/haidi-dk.tsv",
  sep = "\t",
  quote = TRUE,
  row.names = FALSE
)
# transform data to wide (10 content units * 28 content vars):
# one row per coder, one column per variable/content-unit pair
data <- data |>
  pivot_wider(id_cols = coder_id, names_from = some, values_from = 3:30) |>
  select(-coder_id)
# 
data
# A tibble: 2 × 280
  V3_DK001 V3_DK002 V3_DK003 V3_DK004 V3_DK005 V3_DK006 V3_DK007 V3_DK…¹ V3_DK…²
     <int>    <int>    <int>    <int>    <int>    <int>    <int>   <int>   <int>
1      208      206      208      206      206      206      206     206     206
2      208      206      208      206      206      206      206     206     206
# … with 271 more variables: V3_DK010 <int>, V4_DK001 <int>, V4_DK002 <int>,
#   V4_DK003 <int>, V4_DK004 <int>, V4_DK005 <int>, V4_DK006 <int>,
#   V4_DK007 <int>, V4_DK008 <int>, V4_DK009 <int>, V4_DK010 <int>,
#   V5_DK001 <int>, V5_DK002 <int>, V5_DK003 <int>, V5_DK004 <int>,
#   V5_DK005 <int>, V5_DK006 <int>, V5_DK007 <int>, V5_DK008 <int>,
#   V5_DK009 <int>, V5_DK010 <int>, V6_DK001 <int>, V6_DK002 <int>,
#   V6_DK003 <int>, V6_DK004 <int>, V6_DK005 <int>, V6_DK006 <int>, …
# inter rater reliability: matrix with one row per coder
data <- as.matrix(data)
# specify data type: nominal, ordinal, interval, ratio 
irr::kripp.alpha(data, method = "nominal")
 Krippendorff's alpha

 Subjects = 280 
   Raters = 2 
    alpha = 0.639 
# write data (the file is tab-separated despite the .csv extension)
fn <- "../csv/haidi-wp1-coding-dk-wide.csv"
write.table(data, fn, sep = "\t", quote = TRUE, row.names = FALSE)
  • dataset wide format link

230515: swedish dataset

# load dataset (tab-separated, no header row); quote="" disables quote
# handling, so quote characters in text fields are read literally.
# The earlier read with default quoting was overwritten immediately and is dropped.
fn <- "../csv/some.tsv"
data <- read.table(
  fn,
  sep = "\t",
  header = FALSE,
  quote = "",
  strip.white = TRUE,
  stringsAsFactors = FALSE
)
# select data: drop columns 2, 10, 11, 16, 17, move the last column to the
# front, and name the two id columns
data <- data |>
  as_tibble() |>
  select(-c(2, 10, 11, 16, 17)) |>
  select(last_col(), 1:29) |>
  rename(coder_id = V35, content_id = V1)
# duplicate content_id for both coders: coder A's ids are repeated twice
# NOTE(review): assumes coder A's rows are exactly half of the data and that
# the other coder's rows follow in the same content order
data$some <- rep(data$content_id[data$coder_id == "A"], 2)

230516: analyze data

# 
# save the long-format data (tab-separated, quoted strings, no row names)
write.table(
  data,
  "../csv/haidi-se.tsv",
  sep = "\t",
  quote = TRUE,
  row.names = FALSE
)
# transform data to wide (10 content units * 28 content vars):
# one row per coder, one column per variable/content-unit pair
data <- data |>
  pivot_wider(id_cols = coder_id, names_from = some, values_from = 3:30) |>
  select(-coder_id)
# 
data
# A tibble: 2 × 280
  V3_S001 V3_S002 V3_S003 V3_S004 V3_S005 V3_S006 V3_S007 V3_S008 V3_S009
    <int>   <int>   <int>   <int>   <int>   <int>   <int>   <int>   <int>
1     102     103     102     101     104     102     103     101     101
2     102     103     102     101     104     102     103     101     101
# … with 271 more variables: V3_S010 <int>, V4_S001 <int>, V4_S002 <int>,
#   V4_S003 <int>, V4_S004 <int>, V4_S005 <int>, V4_S006 <int>, V4_S007 <int>,
#   V4_S008 <int>, V4_S009 <int>, V4_S010 <int>, V5_S001 <int>, V5_S002 <int>,
#   V5_S003 <int>, V5_S004 <int>, V5_S005 <int>, V5_S006 <int>, V5_S007 <int>,
#   V5_S008 <int>, V5_S009 <int>, V5_S010 <int>, V6_S001 <int>, V6_S002 <int>,
#   V6_S003 <int>, V6_S004 <int>, V6_S005 <int>, V6_S006 <int>, V6_S007 <int>,
#   V6_S008 <int>, V6_S009 <int>, V6_S010 <int>, V7_S001 <int>, …
# inter rater reliability: matrix with one row per coder
data <- as.matrix(data)
# specify data type: nominal, ordinal, interval, ratio 
irr::kripp.alpha(data, method = "nominal")
 Krippendorff's alpha

 Subjects = 280 
   Raters = 2 
    alpha = 0.784 
# write data (the file is tab-separated despite the .csv extension)
fn <- "../csv/test.csv"
write.table(data, fn, sep = "\t", quote = TRUE, row.names = FALSE)
  • dataset wide format link

230426: sample dataset

# load dataset: sheet "Blad1", with columns named x1 ... x34
data <- readxl::read_excel(
  "../csv/some.xlsx",
  sheet = "Blad1",
  col_names = paste0("x", 1:34)
)
# select data: first five rows are coder "a", next five coder "b";
# content_id numbers the rows 1-5 within each coder
data <- data |>
  mutate(
    coder_id = rep(c("a", "b"), each = 5),
    content_id = rep(1:5, times = 2)
  ) |>
  select("coder_id", "content_id", 1, 3, 9, 10) |>
  print(n = 100)
# A tibble: 10 × 6
   coder_id content_id x1       x3    x9 x10                                    
   <chr>         <int> <chr> <dbl> <dbl> <chr>                                  
 1 a                 1 S001    102    99 Dolda larmsiffrorna: Så dåligt mår 85-…
 2 a                 2 S002    103    99 Satsningar som räddar liv              
 3 a                 3 S003    102    99 Detta måste ni rätta till i vården, po…
 4 a                 4 S004    101    99 Sju utmaningar - därför är det kris i …
 5 a                 5 S005    104    99 De kommande årens satsningar sker i pr…
 6 b                 1 S006    102     0 De har full koll på senioren           
 7 b                 2 S007    103    99 Mossig kritik mot vårdappar            
 8 b                 3 S008    101    99 Folksjukdomar som kan förvärras i spår…
 9 b                 4 S009    101    99 Så vill regeringen möta utmaningarna i…
10 b                 5 S010    105    99 Tekniken ska avlasta personalen        
# transform data: one row per coder, one column per content_id, values from x3
data <- data |>
  pivot_wider(
    id_cols = coder_id,
    names_from = content_id,
    values_from = x3
  ) |>
  select(-coder_id)
# https://rpubs.com/jacoblong/content-analysis-krippendorff-alpha-R
data
# A tibble: 2 × 5
    `1`   `2`   `3`   `4`   `5`
  <dbl> <dbl> <dbl> <dbl> <dbl>
1   102   103   102   101   104
2   102   103   101   101   105

230427: analyze data

# inter rater reliability: matrix with one row per coder
data <- as.matrix(data)
# specify data type: nominal, ordinal, interval, ratio 
irr::kripp.alpha(data, method = "nominal")
 Krippendorff's alpha

 Subjects = 5 
   Raters = 2 
    alpha = 0.526 
  • Krippendorff’s alpha ranges from -1 to 1: 1 means perfect agreement between the raters, 0 means agreement no better than chance, and negative values mean the raters disagree systematically. As suggested by Krippendorff, alphas above 0.8 are considered very good agreement, and tentative conclusions can be drawn from data where α ≥ 0.667.

sample data

# get some data: five content units, each coded by coders A and B
data <- tibble(
  content_id = rep(c(1, 2, 3, 4, 5), times = 2),
  coder_id = rep(c("A", "B"), each = 5),
  var1 = c(1, 3, 5, 7, 1, 1, 3, 3, 7, 3),
  var2 = c(
    "Red", "Blue", "Blue", "Green", "Red",
    "Red", "Blue", "Green", "Green", "Red"
  ),
  var3 = c(
    FALSE, TRUE, TRUE, TRUE, FALSE,
    FALSE, FALSE, FALSE, TRUE, FALSE
  )
)

print(data, n = 100)
# A tibble: 10 × 5
   content_id coder_id  var1 var2  var3 
        <dbl> <chr>    <dbl> <chr> <lgl>
 1          1 A            1 Red   FALSE
 2          2 A            3 Blue  TRUE 
 3          3 A            5 Blue  TRUE 
 4          4 A            7 Green TRUE 
 5          5 A            1 Red   FALSE
 6          1 B            1 Red   FALSE
 7          2 B            3 Blue  FALSE
 8          3 B            3 Green FALSE
 9          4 B            7 Green TRUE 
10          5 B            3 Red   FALSE

exclude

# stop knitting here; TRUE is spelled out because T is an ordinary variable
# that can be reassigned
if (TRUE) {
  knitr::knit_exit()
}